Partially revert I61dc536 that broke phrase search support.
Fix phrase search by making explicit that there are two
kind of legalSearchChars() usecases :
- the chars allowed to be part of the search query (including special
syntax chars such as " and *). Used by SearchDatabase::filter() to
cleanup the whole query string (the default).
- the chars allowed to be part of a search term (excluding special
syntax chars) Used by search engine implementaions when parsing with
a regex.
For future reference:
Originally this distinction was made "explicit" by calling directly
SearchEngine::legalSearchChars() during the parsing stage. This was
broken by Iaabc10c by enabling inheritance.
This patch adds a new optional param to legalSearchChars to make this
more explicit.
Also remove the function I introduced in I61dc536 (I wrongly assumed
that the disctinction made between legalSearchChars usecases was due
to a difference in behavior between indexing and searching).
Added more tests to prevent this from happening in the future.
Bug: T167798
Change-Id: Ibdc796bb2881a2ed8194099d8c9f491980010f0f
# Language-specific strip/conversion
$text = $wgContLang->normalizeForSearch( $text );
$se = $se ?: MediaWikiServices::getInstance()->newSearchEngine();
- $lc = $se->legalSearchCharsForUpdate() . '&#;';
+ $lc = $se->legalSearchChars() . '&#;';
$text = preg_replace( "/<\\/?\\s*[A-Za-z][^>]*?>/",
' ', $wgContLang->lc( " " . $text . " " ) ); # Strip HTML markup
$ns = $this->title->getNamespace();
$title = $this->title->getText();
- $lc = $search->legalSearchCharsForUpdate() . '&#;';
+ $lc = $search->legalSearchChars() . '&#;';
$t = $wgContLang->normalizeForSearch( $title );
$t = preg_replace( "/[^{$lc}]+/", ' ', $t );
$t = $wgContLang->lc( $t );
* @return string
*/
protected function filter( $text ) {
- $lc = $this->legalSearchChars();
+ // List of chars allowed in the search query.
+ // This must include chars used in the search syntax.
+ // Usually " (phrase) or * (wildcards) if supported by the engine
+ $lc = $this->legalSearchChars( self::CHARS_ALL );
return trim( preg_replace( "/[^{$lc}]/", " ", $text ) );
}
}
/** @const string profile type for query independent ranking features */
const FT_QUERY_INDEP_PROFILE_TYPE = 'fulltextQueryIndepProfile';
+ /** @const int flag for legalSearchChars: includes all chars allowed in a search query */
+ const CHARS_ALL = 1;
+
+ /** @const int flag for legalSearchChars: includes all chars allowed in a search term */
+ const CHARS_NO_SYNTAX = 2;
+
/**
* Perform a full text search query and return a result set.
* If full text searches are not supported or disabled, return null.
}
/**
- * Get chars legal for search (at query time).
+ * Get chars legal for search
* NOTE: usage as static is deprecated and preserved only as BC measure
+ * @param int $type type of search chars (see self::CHARS_ALL
+ * and self::CHARS_NO_SYNTAX). Defaults to CHARS_ALL
* @return string
*/
- public static function legalSearchChars() {
+ public static function legalSearchChars( $type = self::CHARS_ALL ) {
return "A-Za-z_'.0-9\\x80-\\xFF\\-";
}
- /**
- * Get chars legal for search (at index time).
- *
- * @since 1.30
- * @return string
- */
- public function legalSearchCharsForUpdate() {
- return static::legalSearchChars();
- }
-
/**
* Set the maximum number of results to return
* and how many to skip before returning the first.
*/
function parseQuery( $filteredText, $fulltext ) {
global $wgContLang;
- $lc = $this->legalSearchChars();
+ $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX );
$this->searchTerms = [];
# @todo FIXME: This doesn't handle parenthetical expressions.
function parseQuery( $filteredText, $fulltext ) {
global $wgContLang;
- $lc = $this->legalSearchChars(); // Minus format chars
+ $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *)
$searchon = '';
$this->searchTerms = [];
return $regex;
}
- public function legalSearchCharsForUpdate() {
- return "\"*" . parent::legalSearchCharsForUpdate();
+ public static function legalSearchChars( $type = self::CHARS_ALL ) {
+ $searchChars = parent::legalSearchChars( $type );
+ if ( $type === self::CHARS_ALL ) {
+ // " for phrase, * for wildcard
+ $searchChars = "\"*" . $searchChars;
+ }
+ return $searchChars;
}
/**
*/
function parseQuery( $filteredText, $fulltext ) {
global $wgContLang;
- $lc = $this->legalSearchChars();
+ $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX );
$this->searchTerms = [];
# @todo FIXME: This doesn't handle parenthetical expressions.
[] );
}
- public function legalSearchCharsForUpdate() {
- return "\"" . parent::legalSearchCharsForUpdate();
+ public static function legalSearchChars( $type = self::CHARS_ALL ) {
+ $searchChars = parent::legalSearchChars( $type );
+ if ( $type === self::CHARS_ALL ) {
+ $searchChars = "\"" . $searchChars;
+ }
+ return $searchChars;
}
}
*/
function parseQuery( $filteredText, $fulltext ) {
global $wgContLang;
- $lc = $this->legalSearchChars(); // Minus format chars
+ $lc = $this->legalSearchChars( self::CHARS_NO_SYNTAX ); // Minus syntax chars (" and *)
$searchon = '';
$this->searchTerms = [];
return $regex;
}
- public function legalSearchCharsForUpdate() {
- return "\"*" . parent::legalSearchCharsForUpdate();
+ public static function legalSearchChars( $type = self::CHARS_ALL ) {
+ $searchChars = parent::legalSearchChars( $type );
+ if ( $type === self::CHARS_ALL ) {
+ // " for phrase, * for wildcard
+ $searchChars = "\"*" . $searchChars;
+ }
+ return $searchChars;
}
/**
"Plain search" );
}
+ public function testWildcardSearch() {
+ $res = $this->search->searchText( 'smith*' );
+ $this->assertEquals(
+ [ 'Smithee' ],
+ $this->fetchIds( $res ),
+ "Search with wildcards" );
+
+ $res = $this->search->searchText( 'smithson*' );
+ $this->assertEquals(
+ [],
+ $this->fetchIds( $res ),
+ "Search with wildcards must not find unrelated articles" );
+
+ $res = $this->search->searchText( 'smith* smithee' );
+ $this->assertEquals(
+ [ 'Smithee' ],
+ $this->fetchIds( $res ),
+ "Search with wildcards can be combined with simple terms" );
+
+ $res = $this->search->searchText( 'smith* "one who smiths"' );
+ $this->assertEquals(
+ [ 'Smithee' ],
+ $this->fetchIds( $res ),
+ "Search with wildcards can be combined with phrase search" );
+ }
+
public function testPhraseSearch() {
$res = $this->search->searchText( '"smithee is one who smiths"' );
$this->assertEquals(
[ 'Smithee' ],
$this->fetchIds( $res ),
"Search a phrase" );
- $res = $this->search->searchText( '"smithee is one who smiths"' );
+
+ $res = $this->search->searchText( '"smithee is who smiths"' );
+ $this->assertEquals(
+ [],
+ $this->fetchIds( $res ),
+ "Phrase search is not sloppy, search terms must be adjacent" );
+
+ $res = $this->search->searchText( '"is smithee one who smiths"' );
+ $this->assertEquals(
+ [],
+ $this->fetchIds( $res ),
+ "Phrase search is ordered" );
+ }
+
+ public function testPhraseSearchHighlight() {
+ $phrase = "smithee is one who smiths";
+ $res = $this->search->searchText( "\"$phrase\"" );
$match = $res->next();
- $terms = [ 'smithee', 'is', 'one', 'who', 'smiths' ];
- $snippet = "";
- foreach ( $terms as $term ) {
- $snippet .= " <span class='searchmatch'>" . $term . "</span>";
- }
- $this->assertRegexp( '/' . preg_quote( $snippet, '/' ) . '/',
+ $snippet = "A <span class='searchmatch'>" . $phrase . "</span>";
+ $this->assertStringStartsWith( $snippet,
$match->getTextSnippet( $res->termMatches() ),
"Highlight a phrase search" );
}